Setup

Parameters

print(params)
## $gene.metadata.path
## [1] "../../data/gene_metadata"
## 
## $input.samples.rds.path
## [1] "../../results/01_Filter_Samples"
## 
## $input.variability.rds.path
## [1] "../../results/02_Variability_Jackknife"
## 
## $output.rds.path
## [1] "../../results/03_Variability_Plots"
## 
## $species
## [1] "ELU"
## 
## $species.name
## [1] "Esox lucius"
## 
## $min.percentile
## [1] 0
## 
## $max.percentile
## [1] 0.95
## 
## $all.nonzero.matrix
## [1] TRUE

Libraries

library(dplyr)

Functions

source("../functions/helper_functions.R")
source("../functions/variability_jackknife.R")
source("../functions/variability_plots.R")

Session Info

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 11.7.8
## 
## Matrix products: default
## LAPACK: /Users/cbucao/miniforge3/envs/fish-ev-gene-function/lib/libopenblasp-r0.3.25.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] cowplot_1.1.1        ggpubr_0.4.0         slider_0.2.2         viridis_0.6.3        viridisLite_0.4.1   
##  [6] gplots_3.1.3         ggplot2_3.3.6        matrixStats_0.62.0   edgeR_3.34.1         limma_3.48.3        
## [11] colorBlindness_0.1.9 dplyr_1.0.9         
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.6          tidyr_1.2.1         splines_4.1.0       jsonlite_1.8.9      carData_3.0-5      
##  [6] warp_0.2.0          gtools_3.9.4        bslib_0.5.0         BiocManager_1.30.21 highr_0.10         
## [11] yulab.utils_0.1.8   renv_0.17.3         yaml_2.3.7          pillar_1.7.0        backports_1.4.1    
## [16] lattice_0.21-8      glue_1.8.0          digest_0.6.37       ggsignif_0.6.4      colorspace_2.1-0   
## [21] Matrix_1.5-4.1      htmltools_0.5.5     pkgconfig_2.0.3     broom_1.0.5         purrr_0.3.5        
## [26] tidytree_0.4.6      scales_1.2.1        tibble_3.1.7        mgcv_1.8-42         generics_0.1.3     
## [31] farver_2.1.1        car_3.1-2           ellipsis_0.3.2      cachem_1.0.8        withr_3.0.2        
## [36] lazyeval_0.2.2      cli_3.6.3           magrittr_2.0.3      crayon_1.5.3        evaluate_1.0.1     
## [41] fs_1.6.5            fansi_1.0.4         MASS_7.3-58.3       nlme_3.1-162        rstatix_0.7.2      
## [46] tools_4.1.0         lifecycle_1.0.4     munsell_0.5.0       locfit_1.5-9.7      isoband_0.2.7      
## [51] compiler_4.1.0      jquerylib_0.1.4     caTools_1.18.2      gridGraphics_0.5-1  rlang_1.1.4        
## [56] grid_4.1.0          bitops_1.0-7        labeling_0.4.2      rmarkdown_2.22      gtable_0.3.3       
## [61] abind_1.4-5         DBI_1.1.3           R6_2.5.1            gridExtra_2.3       knitr_1.43         
## [66] fastmap_1.1.1       utf8_1.2.3          treeio_1.16.2       KernSmooth_2.23-21  ape_5.8            
## [71] Rcpp_1.0.10         vctrs_0.4.1         tidyselect_1.2.0    xfun_0.39

Data

Metadata

biotype <- 
  read.delim(file.path(params$gene.metadata.path, 
                       paste(params$species, "biotype_Ensembl_105.txt", sep="_")), 
             sep="\t")

Processed data

# Load saved data from 01_Filter_Samples.Rmd
# Normalized by condition
if (!params$all.nonzero.matrix) {
  filtered.samples <- readRDS(
    file.path(params$input.samples.rds.path,
              paste(params$species, "4_reps_data.conditions.rds", sep="_")))
} else {
  filtered.samples <- readRDS(
    file.path(params$input.samples.rds.path,
              paste(params$species, "4_reps_data.conditions.nonzero.rds", sep="_")))  
}

Tissue <- as.factor(filtered.samples$metadata$Tissue)
if ("Sex" %in% colnames(filtered.samples$metadata)) { 
  Sex <- as.factor(filtered.samples$metadata$Sex) 
}

# Load jackknifed expression variability estimates from 02_Variability_Jackknife.Rmd
## Adjusted SD
if (!params$all.nonzero.matrix) {
  jack.adj.sd <- readRDS(
    file.path(params$input.variability.rds.path,
              paste(params$species, "jack.adj.sd.rds", sep="_")))

  ## Residual SD
  jack.resid.sd <- readRDS(
    file.path(params$input.variability.rds.path,
              paste(params$species, "jack.resid.sd.rds", sep="_")))

  ## Residual CV / Local (residual) CV
  jack.resid.cv <- readRDS(
    file.path(params$input.variability.rds.path,
              paste(params$species, "jack.resid.cv.rds", sep="_")))  
} else {
  jack.adj.sd <- readRDS(
    file.path(params$input.variability.rds.path,
              paste(params$species, "jack.adj.sd.nonzero.rds", sep="_")))

  ## Residual SD
  jack.resid.sd <- readRDS(
    file.path(params$input.variability.rds.path,
              paste(params$species, "jack.resid.sd.nonzero.rds", sep="_")))

  ## Residual CV / Local (residual) CV
  jack.resid.cv <- readRDS(
    file.path(params$input.variability.rds.path,
              paste(params$species, "jack.resid.cv.nonzero.rds", sep="_")))   
}

Main

Initial summary statistics

if ("Sex" %in% colnames(filtered.samples$metadata)) {
  summary.stats <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        compute_gene_summary_stats(filtered.samples$log2.tmm.cpm.gf[[t]][[s]])
    })
  })
  
  # Remove top and bottom x% of genes by expression level
  summary.stats <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        summary.stats[[t]][[s]][
          summary.stats[[t]][[s]]$Rank_Mean>params$min.percentile &
            summary.stats[[t]][[s]]$Rank_Mean<params$max.percentile,]
      })
    })
  
  # Remove missing conditions
  summary.stats <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        summary.stats[[t]][[s]][complete.cases(summary.stats[[t]][[s]]),]
      })
    })
  
} else {
  summary.stats <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      compute_gene_summary_stats(filtered.samples$log2.tmm.cpm.gf[[t]])
    })
  
  # Remove top and bottom x% of genes by expression level
  summary.stats <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
        summary.stats[[t]][
          summary.stats[[t]]$Rank_Mean>params$min.percentile &
            summary.stats[[t]]$Rank_Mean<params$max.percentile,]
    })
}

Plots

SD x Mean

plot_sd_vs_mean(summary.stats, filtered.samples$metadata)

Log2 SD x Mean

plot_sd_vs_mean(summary.stats, filtered.samples$metadata, log2=TRUE)

CV vs Mean

plot_cv_vs_mean(summary.stats, filtered.samples$metadata, log2=FALSE)

Log2 (CV^2) vs Mean

plot_cv_vs_mean(summary.stats, filtered.samples$metadata, log2=TRUE)

Jackknife

Summarized results

# Adjusted SD
if ("Sex" %in% colnames(filtered.samples$metadata)) { 
  jack.adj.sd.summary <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        if (!is.null(jack.adj.sd[[t]][[s]])) { 
          jack.adj.sd[[t]][[s]]$summary %>%
            tibble::rownames_to_column(., var="GeneID")
        }
      })
    })
} else { 
  jack.adj.sd.summary <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      jack.adj.sd[[t]]$summary %>%
        tibble::rownames_to_column(., var="GeneID")
      })
}

# Residual SD
if ("Sex" %in% colnames(filtered.samples$metadata)) { 
  jack.resid.sd.summary <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        if (!is.null(jack.resid.sd[[t]][[s]])) { 
          jack.resid.sd[[t]][[s]]$summary %>%
            tibble::rownames_to_column(., var="GeneID")
        }
      })
    })
} else { 
  jack.resid.sd.summary <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      jack.resid.sd[[t]]$summary %>%
        tibble::rownames_to_column(., var="GeneID")
      })
}

# Residual CV
if ("Sex" %in% colnames(filtered.samples$metadata)) { 
  jack.resid.cv.summary <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        if (!is.null(jack.resid.cv[[t]][[s]])) { 
          jack.resid.cv[[t]][[s]]$summary %>%
            tibble::rownames_to_column(., var="GeneID")
        }
      })
    })
} else { 
  jack.resid.cv.summary <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      jack.resid.cv[[t]]$summary %>%
        tibble::rownames_to_column(., var="GeneID")
      })
}

Plots

Log2 SD x Mean

plot_jackknife_log2sd_vs_mean(jack.resid.sd.summary, filtered.samples$metadata)

Log2 CV x Mean

plot_jackknife_log2cv_vs_mean(jack.resid.cv.summary, filtered.samples$metadata)

Adjusted SD x Mean

plot_jackknife_adj_sd_vs_mean(jack.adj.sd.summary, filtered.samples$metadata)

Residual log2 SD x Mean

plot_jackknife_resid_var_vs_mean(jack.resid.sd.summary, filtered.samples$metadata, method="sd")

Residual log2 CV x Mean

plot_jackknife_resid_var_vs_mean(jack.resid.cv.summary, filtered.samples$metadata, method="cv")

Global variability rank - based on residual log2 CV

# Based on percentile rank of residual log2(CV^2) across the range of expression levels
plot_jackknife_global_var_rank_vs_mean(jack.resid.cv.summary, filtered.samples$metadata, method="cv")

Local variability rank - based on residual log2 CV

# Based on mean percentile rank of residual log2(CV^2) within sliding windows of genes with similar expression levels
plot_jackknife_local_var_rank_vs_mean(jack.resid.cv.summary, filtered.samples$metadata, method="cv")

Protein-coding vs lncRNA

Split by gene biotype

jack.resid.cv.biotype <- list()

if ("Sex" %in% colnames(filtered.samples$metadata)) {
  jack.resid.cv.biotype$coding <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        jack.resid.cv[[t]][[s]]$summary[
          rownames(jack.resid.cv[[t]][[s]]$summary) %in%
            biotype$Gene.stable.ID[biotype$Gene.type=="protein_coding"],]
    })
  })
  
  jack.resid.cv.biotype$lncrna <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        jack.resid.cv[[t]][[s]]$summary[
          rownames(jack.resid.cv[[t]][[s]]$summary) %in% 
            biotype$Gene.stable.ID[biotype$Gene.type %in% c("lncRNA", "lincRNA")],]
    })
  })  
} else {
  jack.resid.cv.biotype$coding <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      jack.resid.cv[[t]]$summary[
        rownames(jack.resid.cv[[t]]$summary) %in%
          biotype$Gene.stable.ID[biotype$Gene.type=="protein_coding"],]
  })    

  jack.resid.cv.biotype$lncrna <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      jack.resid.cv[[t]]$summary[
        rownames(jack.resid.cv[[t]]$summary) %in%
          biotype$Gene.stable.ID[biotype$Gene.type %in% c("lncRNA", "lincRNA")],]
  })   
}

Bind jackknife results across all conditions

if ("Sex" %in% colnames(filtered.samples$metadata)) { 
  jack.resid.cv.bind <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        if (!is.null(jack.resid.cv[[t]][[s]])) { 
          jack.resid.cv[[t]][[s]]$summary %>%
            tibble::rownames_to_column(., var="GeneID")
          }
        }) %>%
        dplyr::bind_rows(.id="Sex")
      }) %>%
    dplyr::bind_rows(.id="Tissue")
} else { 
  jack.resid.cv.bind <- 
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      jack.resid.cv[[t]]$summary %>%
        tibble::rownames_to_column(., var="GeneID")
      }) %>%
    dplyr::bind_rows(.id="Tissue")
} 

jack.resid.cv.bind <- jack.resid.cv.bind %>%
  dplyr::inner_join(biotype[,-1], by=c("GeneID"="Gene.stable.ID"))
colnames(jack.resid.cv.bind)[ncol(jack.resid.cv.bind)] <- "Biotype"

jack.resid.cv.bind <- 
  jack.resid.cv.bind[jack.resid.cv.bind$Biotype %in% c("protein_coding", "lncRNA", "lincRNA"),]

Compare variability rank distribution of protein-coding genes and lncRNA

if ("Sex" %in% colnames(filtered.samples$metadata)) {
  boxplot_var_rank_lncrna_vs_coding(jack.resid.cv.bind, sex=TRUE, method="cv")  
} else {
  boxplot_var_rank_lncrna_vs_coding(jack.resid.cv.bind, sex=FALSE, method="cv")
}
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

Save

if (!dir.exists(params$output.rds.path)) { dir.create(params$output.rds.path, showWarnings=FALSE) }

if (!params$all.nonzero.matrix) {
  saveRDS(jack.resid.cv.bind,
          file.path(params$output.rds.path, 
                    paste(params$species, "jack.resid.cv.bind.rds", sep="_")))
  
  saveRDS(jack.resid.cv.summary,
          file.path(params$output.rds.path, 
                    paste(params$species, "jack.resid.cv.summary.rds", sep="_")))
  
  saveRDS(jack.resid.cv.biotype,
          file.path(params$output.rds.path, 
                    paste(params$species, "jack.resid.cv.biotype.rds", sep="_")))  
} else {
  saveRDS(jack.resid.cv.bind,
          file.path(params$output.rds.path, 
                    paste(params$species, "jack.resid.cv.bind.nonzero.rds", sep="_")))
  
  saveRDS(jack.resid.cv.summary,
          file.path(params$output.rds.path, 
                    paste(params$species, "jack.resid.cv.summary.nonzero.rds", sep="_")))
  
  saveRDS(jack.resid.cv.biotype,
          file.path(params$output.rds.path, 
                    paste(params$species, "jack.resid.cv.biotype.nonzero.rds", sep="_")))   
}